Here we systematically look at the variability in mRNA levels in order to compare to the variability at the small RNA level. We use two measurements as a baseline for variability: (1) technical noise calculated from a spike-in S. pombe total RNA sample (10%) (2) the noise-mean relationship calculated from the C. elegans data
library(ggplot2)
library(MASS)
library(viridis)
## Loading required package: viridisLite
library(msir)
## Package 'msir' version 1.3.2
## Type 'citation("msir")' for citing this R package in publications.
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(RColorBrewer)
library(ggpubr)
## Loading required package: magrittr
library(OneR)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
setwd("/Users/ab6415/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES/06_Figure4")
cel_gene_names<-read.table("/Users/ab6415/Bioinformatics/Analysis/EpiMALs_PAPER_ANALYSES/01_Raw_counts/RAW_RNAseq_COUNTS/gene_names")
nrow(cel_gene_names)
## [1] 20325
#gen 25 samples
RNAseq_counts_gen25<-read.table("../01_Raw_counts/RAW_RNAseq_with_pombe_spike-in/RNA-seq_25thgen_pombe_spike-in.txt")
RNAseq_counts_gen25$V1<-as.character(RNAseq_counts_gen25$V1)
RNAseq_counts_gen25[1:20320,"V1"]<-as.character(cel_gene_names$V1[1:20320])
which(table(RNAseq_counts_gen25$V1)>1)
## SPAC16E8.18 SPBC3B8.10
## 13228 16433
RNAseq_counts_gen25[which(RNAseq_counts_gen25$V1=="SPAC16E8.18"),]
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 20749 SPAC16E8.18 82 38 39 57 50 62 45 43 51 81 53 79
## 20750 SPAC16E8.18 123 54 70 70 63 76 63 62 79 75 70 87
RNAseq_counts_gen25[which(RNAseq_counts_gen25$V1=="SPBC3B8.10"),]
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13
## 23943 SPBC3B8.10 0 0 0 0 0 0 0 0 0 0 0 0
## 23944 SPBC3B8.10 149 108 89 108 99 133 106 97 102 76 115 151
RNAseq_counts_gen25<-RNAseq_counts_gen25[-c(20749,23943),]
rownames(RNAseq_counts_gen25)<-RNAseq_counts_gen25$V1
RNAseq_counts_gen25$V1<-NULL
RNAseq_counts_gen25<-RNAseq_counts_gen25[,1:11]
colnames(RNAseq_counts_gen25)<-c("A","B","C","D","F","G","H","I","J","K","L")
spo_genes<-read.table("spo_genes_form")
colSums(RNAseq_counts_gen25)
## A B C D F G H I
## 46203428 42289790 39508339 39664129 41555108 44410973 42587690 45283325
## J K L
## 45219620 39861844 42620666
colSums(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])
## A B C D F G H I J K
## 5228848 3498817 3260808 3951042 3164044 3781192 3475598 3700445 3764060 5058339
## L
## 3969964
colSums(RNAseq_counts_gen25)-colSums(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])
## A B C D F G H I
## 40974580 38790973 36247531 35713087 38391064 40629781 39112092 41582880
## J K L
## 41455560 34803505 38650702
colSums(RNAseq_counts_gen25[-which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])
## A B C D F G H I
## 40974580 38790973 36247531 35713087 38391064 40629781 39112092 41582880
## J K L
## 41455560 34803505 38650702
colSums(RNAseq_counts_gen25[-which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])/colSums(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])
## A B C D F G H I
## 7.836254 11.086883 11.116119 9.038903 12.133543 10.745231 11.253342 11.237265
## J K L
## 11.013523 6.880422 9.735781
barplot(rbind(colSums(RNAseq_counts_gen25[-which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])
,colSums(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])),ylab="total reads",
main="gen 25 samples")
barplot(prop.table(rbind(colSums(RNAseq_counts_gen25[-which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])
,colSums(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),])),margin=2),ylab="fraction of reads", main="gen 25 samples")
##Generation 100 samples
RNAseq_counts_gen100<-read.table("../01_Raw_counts/RAW_RNAseq_with_pombe_spike-in/RNA-seq_100thgen_pombe_spike-in.txt")
RNAseq_counts_gen100$V1<-as.character(RNAseq_counts_gen100$V1)
RNAseq_counts_gen100[1:20320,"V1"]<-as.character(cel_gene_names$V1[1:20320])
which(table(RNAseq_counts_gen100$V1)>1)
## SPAC16E8.18 SPBC3B8.10
## 13228 16433
RNAseq_counts_gen100[which(RNAseq_counts_gen100$V1=="SPAC16E8.18"),]
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 20749 SPAC16E8.18 72 108 151 96 79 95 77 55 110 71 67
## 20750 SPAC16E8.18 103 138 214 113 119 126 121 94 157 115 113
RNAseq_counts_gen100[which(RNAseq_counts_gen100$V1=="SPBC3B8.10"),]
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 23943 SPBC3B8.10 0 0 0 0 0 0 0 0 0 0 0
## 23944 SPBC3B8.10 195 341 504 243 199 273 241 144 294 293 241
RNAseq_counts_gen100<-RNAseq_counts_gen100[-c(20749,23943),]
rownames(RNAseq_counts_gen100)<-RNAseq_counts_gen100$V1
RNAseq_counts_gen100$V1<-NULL
colnames(RNAseq_counts_gen100)<-c("A","B","C","D","F","G","H","I","J","K","L")
spo_genes<-read.table("spo_genes_form")
colSums(RNAseq_counts_gen100)
## A B C D F G H I
## 61031908 63438891 63724974 64945984 61447685 68576241 63782267 67220074
## J K L
## 61886376 63269400 69029404
colSums(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])
## A B C D F G H I
## 6216996 7922234 13333421 6264308 5890998 6596934 6117166 4635487
## J K L
## 7676646 6363830 6240601
colSums(RNAseq_counts_gen100)-colSums(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])
## A B C D F G H I
## 54814912 55516657 50391553 58681676 55556687 61979307 57665101 62584587
## J K L
## 54209730 56905570 62788803
colSums(RNAseq_counts_gen100[-which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])
## A B C D F G H I
## 54814912 55516657 50391553 58681676 55556687 61979307 57665101 62584587
## J K L
## 54209730 56905570 62788803
colSums(RNAseq_counts_gen100[-which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])/colSums(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])
## A B C D F G H I
## 8.816945 7.007702 3.779342 9.367623 9.430777 9.395169 9.426767 13.501189
## J K L
## 7.061643 8.942032 10.061339
barplot(rbind(colSums(RNAseq_counts_gen100[-which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])
,colSums(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])),ylab="total reads",
main="gen 100 samples")
barplot(prop.table(rbind(colSums(RNAseq_counts_gen100[-which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])
,colSums(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),])),margin=2),ylab="fraction of reads",
main="gen 100 samples")
In order to have the same fraction of pombe reads and C. elegans reads in each sample, and the same depth of sequencing in each sample, we subsample the datasets to the minumum number of C. elegans reads and pombe reads.
min(colSums(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),]))
## [1] 3164044
min(colSums(RNAseq_counts_gen25[-which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),]))
## [1] 34803505
min(colSums(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),]))
## [1] 4635487
min(colSums(RNAseq_counts_gen100[-which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),]))
## [1] 50391553
#function to subsample read counts from the initial table
subsample_counts<-function(df,numcounts){
final_subsampled_df<-data.frame(rownames(df))
colnames(final_subsampled_df)<-"gene_id"
for (col in colnames(df)){
subsample<-sample(x = rownames(df),
prob = df[,col]/sum(df[,col]),
size = numcounts,replace = TRUE)
subsample<-as.data.frame(table(subsample))
colnames(subsample)<-c("gene_id",col)
final_subsampled_df<-merge(final_subsampled_df,subsample,by="gene_id",all=TRUE)
}
final_subsampled_df[is.na(final_subsampled_df)]<-0
return(final_subsampled_df)
}
#gen 25
spo_subsampled<-subsample_counts(RNAseq_counts_gen25[which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),],3164044)
colSums(spo_subsampled[,2:12])
## A B C D F G H I J K
## 3164044 3164044 3164044 3164044 3164044 3164044 3164044 3164044 3164044 3164044
## L
## 3164044
cel_subsampled<-subsample_counts(RNAseq_counts_gen25[-which(rownames(RNAseq_counts_gen25) %in% spo_genes$V1),],34803505)
colSums(cel_subsampled[,2:12])
## A B C D F G H I
## 34803505 34803505 34803505 34803505 34803505 34803505 34803505 34803505
## J K L
## 34803505 34803505 34803505
RNAseq_counts_gen25<-rbind(cel_subsampled,spo_subsampled)
rownames(RNAseq_counts_gen25)<-RNAseq_counts_gen25$gene_id
RNAseq_counts_gen25$gene_id<-NULL
cor(RNAseq_counts_gen25)
## A B C D F G H
## A 1.0000000 0.9950640 0.9926725 0.9840526 0.9902218 0.9903346 0.9947593
## B 0.9950640 1.0000000 0.9877657 0.9640694 0.9957587 0.9946395 0.9938130
## C 0.9926725 0.9877657 1.0000000 0.9802214 0.9832098 0.9853517 0.9873573
## D 0.9840526 0.9640694 0.9802214 1.0000000 0.9570576 0.9591008 0.9726893
## F 0.9902218 0.9957587 0.9832098 0.9570576 1.0000000 0.9986827 0.9948443
## G 0.9903346 0.9946395 0.9853517 0.9591008 0.9986827 1.0000000 0.9965757
## H 0.9947593 0.9938130 0.9873573 0.9726893 0.9948443 0.9965757 1.0000000
## I 0.9928759 0.9962749 0.9833667 0.9659041 0.9965801 0.9964547 0.9974838
## J 0.9895687 0.9760557 0.9948309 0.9905826 0.9697467 0.9733477 0.9809989
## K 0.8312619 0.7840272 0.8383389 0.9052394 0.7799795 0.7881201 0.8168273
## L 0.9893885 0.9953329 0.9816514 0.9564187 0.9989974 0.9990155 0.9955407
## I J K L
## A 0.9928759 0.9895687 0.8312619 0.9893885
## B 0.9962749 0.9760557 0.7840272 0.9953329
## C 0.9833667 0.9948309 0.8383389 0.9816514
## D 0.9659041 0.9905826 0.9052394 0.9564187
## F 0.9965801 0.9697467 0.7799795 0.9989974
## G 0.9964547 0.9733477 0.7881201 0.9990155
## H 0.9974838 0.9809989 0.8168273 0.9955407
## I 1.0000000 0.9731660 0.7979283 0.9978300
## J 0.9731660 1.0000000 0.8719114 0.9683977
## K 0.7979283 0.8719114 1.0000000 0.7802030
## L 0.9978300 0.9683977 0.7802030 1.0000000
cor(spo_subsampled[,2:12])
## A B C D F G H
## A 1.0000000 0.9994730 0.9994287 0.9952281 0.9996825 0.9996672 0.9996516
## B 0.9994730 1.0000000 0.9989122 0.9931777 0.9996243 0.9995846 0.9992465
## C 0.9994287 0.9989122 1.0000000 0.9968090 0.9991559 0.9991732 0.9991390
## D 0.9952281 0.9931777 0.9968090 1.0000000 0.9941671 0.9942080 0.9949992
## F 0.9996825 0.9996243 0.9991559 0.9941671 1.0000000 0.9997227 0.9995649
## G 0.9996672 0.9995846 0.9991732 0.9942080 0.9997227 1.0000000 0.9996401
## H 0.9996516 0.9992465 0.9991390 0.9949992 0.9995649 0.9996401 1.0000000
## I 0.9989099 0.9985020 0.9993769 0.9963505 0.9987255 0.9988668 0.9987788
## J 0.9992499 0.9984263 0.9994608 0.9972687 0.9989145 0.9989707 0.9992438
## K 0.9298062 0.9232132 0.9338028 0.9528995 0.9262256 0.9263897 0.9289713
## L 0.9996434 0.9995023 0.9994671 0.9951329 0.9996322 0.9996355 0.9994842
## I J K L
## A 0.9989099 0.9992499 0.9298062 0.9996434
## B 0.9985020 0.9984263 0.9232132 0.9995023
## C 0.9993769 0.9994608 0.9338028 0.9994671
## D 0.9963505 0.9972687 0.9528995 0.9951329
## F 0.9987255 0.9989145 0.9262256 0.9996322
## G 0.9988668 0.9989707 0.9263897 0.9996355
## H 0.9987788 0.9992438 0.9289713 0.9994842
## I 1.0000000 0.9991566 0.9326316 0.9991949
## J 0.9991566 1.0000000 0.9358483 0.9992505
## K 0.9326316 0.9358483 1.0000000 0.9282693
## L 0.9991949 0.9992505 0.9282693 1.0000000
cor(cel_subsampled[,2:12])
## A B C D F G H
## A 1.0000000 0.9949779 0.9926194 0.9837366 0.9899983 0.9900933 0.9946324
## B 0.9949779 1.0000000 0.9877446 0.9633579 0.9956656 0.9945331 0.9936966
## C 0.9926194 0.9877446 1.0000000 0.9797726 0.9830161 0.9851230 0.9871799
## D 0.9837366 0.9633579 0.9797726 1.0000000 0.9561182 0.9581528 0.9720848
## F 0.9899983 0.9956656 0.9830161 0.9561182 1.0000000 0.9986640 0.9947357
## G 0.9900933 0.9945331 0.9851230 0.9581528 0.9986640 1.0000000 0.9964954
## H 0.9946324 0.9936966 0.9871799 0.9720848 0.9947357 0.9964954 1.0000000
## I 0.9927681 0.9962175 0.9832800 0.9652217 0.9965303 0.9964235 0.9974843
## J 0.9894921 0.9758013 0.9946841 0.9904788 0.9692505 0.9728345 0.9806939
## K 0.8282519 0.7801291 0.8352364 0.9036632 0.7758669 0.7840865 0.8134747
## L 0.9891756 0.9952344 0.9815004 0.9555080 0.9989862 0.9990274 0.9954751
## I J K L
## A 0.9927681 0.9894921 0.8282519 0.9891756
## B 0.9962175 0.9758013 0.7801291 0.9952344
## C 0.9832800 0.9946841 0.8352364 0.9815004
## D 0.9652217 0.9904788 0.9036632 0.9555080
## F 0.9965303 0.9692505 0.7758669 0.9989862
## G 0.9964235 0.9728345 0.7840865 0.9990274
## H 0.9974843 0.9806939 0.8134747 0.9954751
## I 1.0000000 0.9728922 0.7942730 0.9977943
## J 0.9728922 1.0000000 0.8697453 0.9679684
## K 0.7942730 0.8697453 1.0000000 0.7761718
## L 0.9977943 0.9679684 0.7761718 1.0000000
#gen 100
spo_subsampled<-subsample_counts(RNAseq_counts_gen100[which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),],4635487)
colSums(spo_subsampled[,2:12])
## A B C D F G H I J K
## 4635487 4635487 4635487 4635487 4635487 4635487 4635487 4635487 4635487 4635487
## L
## 4635487
cel_subsampled<-subsample_counts(RNAseq_counts_gen100[-which(rownames(RNAseq_counts_gen100) %in% spo_genes$V1),],50391553)
colSums(cel_subsampled[,2:12])
## A B C D F G H I
## 50391553 50391553 50391553 50391553 50391553 50391553 50391553 50391553
## J K L
## 50391553 50391553 50391553
RNAseq_counts_gen100<-rbind(cel_subsampled,spo_subsampled)
rownames(RNAseq_counts_gen100)<-RNAseq_counts_gen100$gene_id
RNAseq_counts_gen100$gene_id<-NULL
cor(RNAseq_counts_gen100)
## A B C D F G H
## A 1.0000000 0.9955098 0.9538059 0.9980639 0.9947207 0.9956476 0.9937329
## B 0.9955098 1.0000000 0.9512504 0.9976975 0.9960330 0.9983067 0.9937157
## C 0.9538059 0.9512504 1.0000000 0.9481415 0.9370806 0.9455802 0.9334888
## D 0.9980639 0.9976975 0.9481415 1.0000000 0.9973834 0.9980639 0.9950652
## F 0.9947207 0.9960330 0.9370806 0.9973834 1.0000000 0.9984806 0.9970368
## G 0.9956476 0.9983067 0.9455802 0.9980639 0.9984806 1.0000000 0.9961877
## H 0.9937329 0.9937157 0.9334888 0.9950652 0.9970368 0.9961877 1.0000000
## I 0.9895998 0.9916150 0.9415896 0.9918383 0.9955967 0.9961352 0.9933549
## J 0.9488381 0.9327833 0.9613408 0.9364191 0.9189831 0.9235615 0.9224881
## K 0.9759445 0.9734167 0.9391496 0.9742912 0.9799272 0.9787351 0.9828325
## L 0.9379715 0.9451072 0.8555066 0.9458630 0.9610838 0.9581887 0.9593733
## I J K L
## A 0.9895998 0.9488381 0.9759445 0.9379715
## B 0.9916150 0.9327833 0.9734167 0.9451072
## C 0.9415896 0.9613408 0.9391496 0.8555066
## D 0.9918383 0.9364191 0.9742912 0.9458630
## F 0.9955967 0.9189831 0.9799272 0.9610838
## G 0.9961352 0.9235615 0.9787351 0.9581887
## H 0.9933549 0.9224881 0.9828325 0.9593733
## I 1.0000000 0.9105820 0.9843009 0.9718910
## J 0.9105820 1.0000000 0.9098669 0.8035172
## K 0.9843009 0.9098669 1.0000000 0.9527860
## L 0.9718910 0.8035172 0.9527860 1.0000000
cor(spo_subsampled[,2:12])
## A B C D F G H
## A 1.0000000 0.9988037 0.9991243 0.9988951 0.9989723 0.9990197 0.9990331
## B 0.9988037 1.0000000 0.9996426 0.9997870 0.9998031 0.9998325 0.9998140
## C 0.9991243 0.9996426 1.0000000 0.9996042 0.9996790 0.9997159 0.9996564
## D 0.9988951 0.9997870 0.9996042 1.0000000 0.9997742 0.9997779 0.9997252
## F 0.9989723 0.9998031 0.9996790 0.9997742 1.0000000 0.9998192 0.9998037
## G 0.9990197 0.9998325 0.9997159 0.9997779 0.9998192 1.0000000 0.9998141
## H 0.9990331 0.9998140 0.9996564 0.9997252 0.9998037 0.9998141 1.0000000
## I 0.9991514 0.9997134 0.9995728 0.9997452 0.9997291 0.9997662 0.9997530
## J 0.9996827 0.9985986 0.9989678 0.9986562 0.9987236 0.9988002 0.9988534
## K 0.9989843 0.9997288 0.9996599 0.9997295 0.9997711 0.9998132 0.9997344
## L 0.9993178 0.9995862 0.9997277 0.9995563 0.9996613 0.9997231 0.9996332
## I J K L
## A 0.9991514 0.9996827 0.9989843 0.9993178
## B 0.9997134 0.9985986 0.9997288 0.9995862
## C 0.9995728 0.9989678 0.9996599 0.9997277
## D 0.9997452 0.9986562 0.9997295 0.9995563
## F 0.9997291 0.9987236 0.9997711 0.9996613
## G 0.9997662 0.9988002 0.9998132 0.9997231
## H 0.9997530 0.9988534 0.9997344 0.9996332
## I 1.0000000 0.9990068 0.9997136 0.9996424
## J 0.9990068 1.0000000 0.9987979 0.9991546
## K 0.9997136 0.9987979 1.0000000 0.9997278
## L 0.9996424 0.9991546 0.9997278 1.0000000
cor(cel_subsampled[,2:12])
## A B C D F G H
## A 1.0000000 0.9954809 0.9531489 0.9980550 0.9946605 0.9956371 0.9936066
## B 0.9954809 1.0000000 0.9504395 0.9976628 0.9959572 0.9982774 0.9936427
## C 0.9531489 0.9504395 1.0000000 0.9473030 0.9360149 0.9446585 0.9324381
## D 0.9980550 0.9976628 0.9473030 1.0000000 0.9973376 0.9980459 0.9949827
## F 0.9946605 0.9959572 0.9360149 0.9973376 1.0000000 0.9984574 0.9970226
## G 0.9956371 0.9982774 0.9446585 0.9980459 0.9984574 1.0000000 0.9961888
## H 0.9936066 0.9936427 0.9324381 0.9949827 0.9970226 0.9961888 1.0000000
## I 0.9896085 0.9915181 0.9406142 0.9917937 0.9955856 0.9961057 0.9934513
## J 0.9478385 0.9317838 0.9616752 0.9353211 0.9175627 0.9224023 0.9207368
## K 0.9754137 0.9729043 0.9382003 0.9737473 0.9795383 0.9783587 0.9824436
## L 0.9375908 0.9446077 0.8535153 0.9454711 0.9608715 0.9578646 0.9594044
## I J K L
## A 0.9896085 0.9478385 0.9754137 0.9375908
## B 0.9915181 0.9317838 0.9729043 0.9446077
## C 0.9406142 0.9616752 0.9382003 0.8535153
## D 0.9917937 0.9353211 0.9737473 0.9454711
## F 0.9955856 0.9175627 0.9795383 0.9608715
## G 0.9961057 0.9224023 0.9783587 0.9578646
## H 0.9934513 0.9207368 0.9824436 0.9594044
## I 1.0000000 0.9095727 0.9842027 0.9716308
## J 0.9095727 1.0000000 0.9077991 0.8012990
## K 0.9842027 0.9077991 1.0000000 0.9526735
## L 0.9716308 0.8012990 0.9526735 1.0000000
library(gplots)
heatmap.2(cor(log2(RNAseq_counts_gen25+1)))
heatmap.2(cor(log2(RNAseq_counts_gen100+1)))
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:msir':
##
## normalize
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which, which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:gplots':
##
## space
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
#coldata<-cbind(c("A","B","C","D","F","G","H","I","J","K","L"))
coldata<-cbind(rep("A",11))
rownames(coldata)<-c("A","B","C","D","F","G","H","I","J","K","L")
colnames(coldata)<-"condition"
dds <- DESeqDataSetFromMatrix(countData = RNAseq_counts_gen25,
colData = coldata,
design = ~ 1)
## converting counts to integer mode
dds <- DESeq(dds)
## Warning in DESeq(dds): the design is ~ 1 (just an intercept). is this intended?
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 246 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
plot(sizeFactors(dds),colSums(RNAseq_counts_gen25))
counts_deseqnorm<-counts(dds, normalized=TRUE)
colSums(counts_deseqnorm)
## A B C D F G H I
## 37044410 38107759 37590979 37471108 36039559 36514938 36648289 37836734
## J K L
## 36575171 42788630 37012269
#calculate cv2
#plot cv2 vs mean
#colour spo genes differently
#use spo genes to model technical noise
#call HV cel genes from the spo model
sds<-apply(counts_deseqnorm,FUN = sd,MARGIN = 1)
means<-as.numeric(apply(counts_deseqnorm,FUN = mean,MARGIN = 1))
final_dataset_gen25<-as.data.frame(counts_deseqnorm)
final_dataset_gen25$mean<-means
final_dataset_gen25$sd<-sds
final_dataset_gen25$cv<-(final_dataset_gen25$sd/final_dataset_gen25$mean)
final_dataset_gen25$cv2<-(final_dataset_gen25$sd/final_dataset_gen25$mean)**2
final_dataset_gen25$ff<-(final_dataset_gen25$sd**2)/final_dataset_gen25$mean
final_dataset_gen25$log10_mean<-log10(final_dataset_gen25$mean)
final_dataset_gen25$log10_cv2<-log10(final_dataset_gen25$cv2)
hist(final_dataset_gen25$cv2,breaks=100)
final_dataset_gen25_cel<-final_dataset_gen25[-which(rownames(final_dataset_gen25) %in% spo_genes$V1),]
final_dataset_gen25_spo<-final_dataset_gen25[which(rownames(final_dataset_gen25) %in% spo_genes$V1),]
nrow(final_dataset_gen25_cel)
## [1] 20320
nrow(final_dataset_gen25_spo)
## [1] 6984
library("scales")
plot(log10(final_dataset_gen25_cel$mean),log10(final_dataset_gen25_cel$cv2),col=alpha("lightblue",0.2),pch=19)
plot(log10(final_dataset_gen25_spo$mean),log10(final_dataset_gen25_spo$cv2),col=alpha("red",0.2),pch=19)
plot(log10(final_dataset_gen25_cel$mean),log10(final_dataset_gen25_cel$cv2),col=alpha("lightblue",0.2),pch=19)
points(log10(final_dataset_gen25_spo$mean),log10(final_dataset_gen25_spo$cv2),col=alpha("red",0.2),pch=19)
#coldata<-cbind(c("A","B","C","D","F","G","H","I","J","K","L"))
coldata<-cbind(rep("A",11))
rownames(coldata)<-c("A","B","C","D","F","G","H","I","J","K","L")
colnames(coldata)<-"condition"
dds <- DESeqDataSetFromMatrix(countData = RNAseq_counts_gen100,
colData = coldata,
design = ~ 1)
## converting counts to integer mode
dds <- DESeq(dds)
## Warning in DESeq(dds): the design is ~ 1 (just an intercept). is this intended?
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 612 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
plot(sizeFactors(dds),colSums(RNAseq_counts_gen100))
counts_deseqnorm<-counts(dds, normalized=TRUE)
colSums(counts_deseqnorm)
## A B C D F G H I
## 54887940 56642682 57451482 54905911 53447634 55184428 54035727 55699294
## J K L
## 54207816 51470616 52490887
#calculate cv2
#plot cv2 vs mean
#colour spo genes differently
#use spo genes to model technical noise
#call HV cel genes from the spo model
sds<-apply(counts_deseqnorm,FUN = sd,MARGIN = 1)
means<-as.numeric(apply(counts_deseqnorm,FUN = mean,MARGIN = 1))
final_dataset_gen100<-as.data.frame(counts_deseqnorm)
final_dataset_gen100$mean<-means
final_dataset_gen100$sd<-sds
final_dataset_gen100$cv<-(final_dataset_gen100$sd/final_dataset_gen100$mean)
final_dataset_gen100$cv2<-(final_dataset_gen100$sd/final_dataset_gen100$mean)**2
final_dataset_gen100$ff<-(final_dataset_gen100$sd**2)/final_dataset_gen100$mean
final_dataset_gen100$log10_mean<-log10(final_dataset_gen100$mean)
final_dataset_gen100$log10_cv2<-log10(final_dataset_gen100$cv2)
hist(final_dataset_gen100$cv2,breaks=100)
final_dataset_gen100_cel<-final_dataset_gen100[-which(rownames(final_dataset_gen100) %in% spo_genes$V1),]
final_dataset_gen100_spo<-final_dataset_gen100[which(rownames(final_dataset_gen100) %in% spo_genes$V1),]
nrow(final_dataset_gen100_cel)
## [1] 20320
nrow(final_dataset_gen100_spo)
## [1] 6984
library("scales")
plot(log10(final_dataset_gen100_cel$mean),log10(final_dataset_gen100_cel$cv2),col=alpha("lightblue",0.2),pch=19)
plot(log10(final_dataset_gen100_spo$mean),log10(final_dataset_gen100_spo$cv2),col=alpha("red",0.2),pch=19)
plot(log10(final_dataset_gen100_cel$mean),log10(final_dataset_gen100_cel$cv2),col=alpha("lightblue",0.2),pch=19)
points(log10(final_dataset_gen100_spo$mean),log10(final_dataset_gen100_spo$cv2),col=alpha("red",0.2),pch=19)
#fit cel data alone
Fit C. elegans counts alone, plot data together with the fit, and highlight genes with 22G-mediated epimutations to show that they are not more variable than you would expect by chance.
dat<-final_dataset_gen100_cel
#density
get_density <- function(x, y, ...) {
dens <- MASS::kde2d(x, y, ...)
ix <- findInterval(x, dens$x)
iy <- findInterval(y, dens$y)
ii <- cbind(ix, iy)
return(dens$z[ii])
}
loess_fit_and_plotres<-function(dat,thr_type,thr,return,plot1_filename,plot2_filename){
dat<-dat[order(dat$mean),]
dat<-dat[which(dat$log10_mean>0),]
dat$ID<-rownames(dat)
#fit c. elegans cv2 data
l<-loess.sd(x = dat$log10_mean,y = dat$log10_cv2, nsigma = 1.96)
l_fit<-data.frame(x=l$x,y=l$y,sd=l$sd,upper=l$upper,lower=l$lower,ID=dat$ID)
l_fit$Z<-(dat$log10_cv2-l_fit$y)/l_fit$sd
l_fit$pvalue<-pnorm(l_fit$Z,lower.tail = FALSE)
l_fit_padj<-l_fit[which(l_fit$x>0.9999999),]
print(nrow(l_fit_padj))
l_fit_padj$padj<-p.adjust(l_fit_padj$pvalue,method = "fdr")
l_fit<-merge(l_fit,l_fit_padj[,c("padj","ID")],by="ID",all.x=TRUE)
l_fit<-l_fit[order(l_fit$x),]
dat$ID<-rownames(dat)
dat<-merge(dat,l_fit[,c("x","y","upper","lower","Z","pvalue","padj","ID")],by="ID",all.x=TRUE)
rownames(dat)<-dat$ID; dat$ID<-NULL
if (thr_type=="fdr"){
dat$sig<-rep(0,nrow(dat))
dat[which(dat$padj<thr),"sig"]<-1
}else if (thr_type=="pval"){
dat$sig<-rep(0,nrow(dat))
dat[which(dat$pvalue<thr),"sig"]<-1
}
dat[which(dat$log10_mean<1),"sig"]<-0
if (return=="none"){
ggplot(dat)+geom_histogram(aes(x=pvalue))
ggplot(dat)+geom_histogram(aes(x=padj))
print(ggplot(dat)+geom_point(aes(dat$log10_mean,dat$log10_cv2),color=alpha("#8da0cb",0.2))+ scale_color_viridis()+geom_line(aes(x=l_fit$x,y=l_fit$lower),color="black",linetype="dashed")+geom_line(aes(x=l_fit$x,y=l_fit$y),color="black")+geom_line(aes(x=l_fit$x,y=l_fit$upper),color="black",linetype="dashed")+geom_point(data=subset(dat,sig==1),aes(dat[which(dat$sig==1),"log10_mean"],dat[which(dat$sig==1),"log10_cv2"]),color=alpha("#66c2a5",0.4))+ylab("log10 cv2")+xlab("log10 mean")+theme_classic())
ggsave(filename = plot1_filename,dpi=150)
dat$density<-get_density(dat$log10_mean,dat$log10_cv2, n = 100)
print(ggplot(dat)+geom_point(aes(dat$log10_mean,dat$log10_cv2,color=density))+ scale_color_viridis()+geom_line(aes(x=l_fit$x,y=l_fit$lower),color="pink",linetype="dashed")+geom_line(aes(x=l_fit$x,y=l_fit$y),color="pink")+geom_line(aes(x=l_fit$x,y=l_fit$upper),color="pink",linetype="dashed")+geom_point(data=subset(dat,sig==1),aes(dat[which(dat$sig==1),"log10_mean"],dat[which(dat$sig==1),"log10_cv2"]),color=alpha("red",0.4))+ylab("log10 cv2")+xlab("log10 mean")+theme_classic())
ggsave(filename = plot2_filename,dpi=150)
}
dat<-dat[order(dat$Z,decreasing=TRUE),]
if (return=="all"){return(dat)
}else if (return=="sig"){return(dat[which(dat$sig==1),])
}else if (return=="none"){return()}
}
Since the technical noise fits are extremely sensitive, we use instead the overall mean-noise relationship for all genes in order to identify HVGs.
loess_fit_and_plotres(final_dataset_gen25_cel,"pval",0.05,return ="none" ,"gen25_mRNA_noise.pdf","gen25_mRNA_noise_density.pdf")
## [1] 13785
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## NULL
loess_fit_and_plotres(final_dataset_gen100_cel,"pval",0.05,return = "none","gen100_mRNA_noise.pdf","gen100_mRNA_noise_density.pdf")
## [1] 14564
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## NULL
gen_25_mRNA_noise<-loess_fit_and_plotres(final_dataset_gen25_cel,"pval",0.05,"all","gen25_mRNA_noise.pdf","gen25_mRNA_noise_density.pdf")
## [1] 13785
gen_100_mRNA_noise<-loess_fit_and_plotres(final_dataset_gen100_cel,"pval",0.05,"all","gen100_mRNA_noise.pdf","gen100_mRNA_noise_density.pdf")
## [1] 14564
HV22Gs_100<-read.table("gen_100_HV22Gs_fdr1e-4.txt",header=TRUE)
HV22Gs_25<-read.table("gen_25_HV22Gs_fdr0.1.txt",header=TRUE)
length(HV22Gs_100$x)
## [1] 1087
length(HV22Gs_25$x)
## [1] 613
length(intersect(HV22Gs_100$x,HV22Gs_25$x))
## [1] 345
length(union(HV22Gs_100$x,HV22Gs_25$x))
## [1] 1355
gen_25_mRNA_noise$HV22Gs<-factor(rep(0,nrow(gen_25_mRNA_noise)),levels=c(0,1))
gen_25_mRNA_noise$HV22Gs[which(rownames(gen_25_mRNA_noise) %in% HV22Gs_25$x)]<-1
gen_100_mRNA_noise$HV22Gs<-factor(rep(0,nrow(gen_100_mRNA_noise)),levels=c(0,1))
gen_100_mRNA_noise$HV22Gs[which(rownames(gen_100_mRNA_noise) %in% HV22Gs_100$x)]<-1
ggplot(gen_25_mRNA_noise)+
geom_point(aes(x=log10_mean,y=log10_cv2),color=alpha("lightblue",0.5))+
geom_point(data=gen_25_mRNA_noise[which(gen_25_mRNA_noise$HV22Gs==1),],aes(x=log10_mean,y=log10_cv2),color=alpha("red",0.8))+
theme_classic()+
geom_line(data=gen_25_mRNA_noise,aes(x=x,y=y))+
geom_line(data=gen_25_mRNA_noise,aes(x=x,y=upper),linetype="dashed")+
geom_line(data=gen_25_mRNA_noise,aes(x=x,y=lower),linetype="dashed")
ggtitle("25th generation lines")
## $title
## [1] "25th generation lines"
##
## attr(,"class")
## [1] "labels"
ggsave("gen25_mRNA_noise_HV22Gs.png",dpi=300,device="png")
## Saving 7 x 5 in image
ggplot(gen_100_mRNA_noise)+
geom_point(aes(x=log10_mean,y=log10_cv2),color=alpha("lightblue",0.5))+
geom_point(data=gen_100_mRNA_noise[which(gen_100_mRNA_noise$HV22Gs==1),],aes(x=log10_mean,y=log10_cv2),color=alpha("red",0.8))+
theme_classic()+
geom_line(data=gen_100_mRNA_noise,aes(x=x,y=y))+
geom_line(data=gen_100_mRNA_noise,aes(x=x,y=upper),linetype="dashed")+
geom_line(data=gen_100_mRNA_noise,aes(x=x,y=lower),linetype="dashed")
ggtitle("100th generation lines")
## $title
## [1] "100th generation lines"
##
## attr(,"class")
## [1] "labels"
ggsave("gen100_mRNA_noise_HV22Gs.png",dpi=300,device="png")
## Saving 7 x 5 in image
pdf("gen25_mRNA_noise_FF_boxplots_HV22Gs.pdf")
boxplot(gen_25_mRNA_noise[which(gen_25_mRNA_noise$HV22Gs==0),"ff"],gen_25_mRNA_noise[which(gen_25_mRNA_noise$HV22Gs==1),"ff"],outline=FALSE,
ylab="mRNA Fano factor",names=c("rest of genes","HV22Gs"),main="25th generation lines")
dev.off()
## quartz_off_screen
## 2
pdf("gen100_mRNA_noise_FF_boxplots_HV22Gs.pdf")
boxplot(gen_100_mRNA_noise[which(gen_100_mRNA_noise$HV22Gs==0),"ff"],gen_100_mRNA_noise[which(gen_100_mRNA_noise$HV22Gs==1),"ff"],outline=FALSE,
ylab="mRNA Fano factor",names=c("rest of genes","HV22Gs"),main="100th generation lines")
dev.off()
## quartz_off_screen
## 2
pdf("gen25_mRNA_noise_Zscore_boxplots_HV22Gs.pdf")
boxplot(gen_25_mRNA_noise[which(gen_25_mRNA_noise$HV22Gs==0),"Z"],gen_25_mRNA_noise[which(gen_25_mRNA_noise$HV22Gs==1),"Z"],outline=FALSE,
ylab="mRNA noise Z-score",names=c("rest of genes","HV22Gs"),main="25th generation lines")
dev.off()
## quartz_off_screen
## 2
pdf("gen100_mRNA_noise_Zscore_boxplots_HV22Gs.pdf")
boxplot(gen_100_mRNA_noise[which(gen_100_mRNA_noise$HV22Gs==0),"Z"],gen_100_mRNA_noise[which(gen_100_mRNA_noise$HV22Gs==1),"Z"],outline=FALSE,
ylab="mRNA noise Z-score",names=c("rest of genes","HV22Gs"),main="100th generation lines")
dev.off()
## quartz_off_screen
## 2